Loading Packages & Initialization

In [2]:
# NOTE(review): rm(list = ls()) wipes the global environment; acceptable in a
# notebook, but avoid in sourced scripts.
rm(list=ls())

library(data.table)
library(tidyverse)
library(rJava)     # Java bridge required by RNetLogo
library(RNetLogo)  # R <-> NetLogo interface used by run_model()/run_ABM()

library(lhs) # For maximin Latin hypercube sampling
library(ggplot2)
library(plotly) # For beautiful plotting
library(caret)
library(randomForest)
library(factoextra)
library(e1071)
library(TSrepr) # for evaluating predictive power

# NOTE(review): require() returns FALSE instead of erroring when the package
# is missing; library() would fail loudly and is usually preferable here.
require(gridExtra)

# Globally suppresses all warnings -- this can hide real problems; consider
# restoring with options(warn = 0) for debugging.
options(warn = -1)
In [3]:
# Select if data generation is wanted
# 1 = run the ABM to (re)generate train/test data; 0 = load cached .csv files.
GenerateTTData <- 0
In [4]:
# 1 = run NetLogo headless (no GUI); 0 = start with GUI.
Is_Headless <- 1
nl.model <- "Segregation"

# NOTE(review): hard-coded absolute Windows paths -- adjust per machine.
nl.path <- "C:/Program Files/NetLogo 6.0.4/app"
model.path <- paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/",nl.model,".nlogo")

if (Is_Headless == 0){
    # NOTE(review): this branch starts NetLogo WITHOUT nl.obj, but all later
    # NL* calls pass nl.obj = nl.model -- GUI mode likely fails; confirm.
    NLStart(nl.path, gui = TRUE,nl.jarname='netlogo-6.0.4.jar')
    NLLoadModel (model.path)
    } else {
    NLStart(nl.path, gui = FALSE,nl.jarname='netlogo-6.0.4.jar', nl.obj = nl.model)
    NLLoadModel (model.path, nl.obj = nl.model)
    
    #NLStart(nl.path, gui = FALSE,nl.jarname='netlogo-6.0.4.jar', nl.obj = nl.model)
    #NLLoadModel (model.path, nl.obj = nl.model )
    }

Model Parameters & Functions

Set model parameters

In [5]:
set.seed(1)

## Set model parameters
 # Number of replications for each instance
nofrep = 10   

# Number of active-learning iterations (model refits)
iteration_budget = 20
 # order feature names according to their definition order in run_model
feature_names = c("density","%-similar-wanted")  
 # NetLogo reporter read back as the model output
output_name = c("percent-similar")

 # Number of input parameters of the agent-based model
nofparams = length(feature_names)      

# set RF parameters
ntree = 400
mtry = 2

Set user parameters

In [6]:
error_type = "RMSE" # alternatives: MAPE, BIAS

# choose the uncertainty measure used by sample_selection()
selection_metric <- "sd" # alternative: "range" 

unlabeled_ins = 700 
test_ins = 400
train_ins_oneshot = 700
train_ins_Ad = 200

# Set selection parameters
selected_ins = 5 # number of instances selected (labeled) in each step

# Set elimination parameters
h <- 1 # number of variables eliminated in each step

Define functions

run_model

In [7]:
#run_model <- function(feature_names,feature_values){ # both should be in character list format
run_model <- function(feature_values){
    # Run one NetLogo simulation at a single design point.
    #
    # feature_values : vector of parameter values ordered like the global
    #                  `feature_names` (coerced to character via paste0).
    # Returns the value of the global `output_name` reporter after 100 ticks.
    # Relies on globals: feature_names, output_name, nl.model.

    # Push each parameter into the NetLogo workspace.
    # seq_along() (instead of 1:k) is safe if feature_names is ever empty.
    for (i in seq_along(feature_names)) {
        NLCommand(paste0("set ",feature_names[i]," ",feature_values[i]), nl.obj = nl.model)
    }
    NLCommand("setup", nl.obj = nl.model)
    NLDoCommand(100, "go", nl.obj = nl.model)  # advance the model 100 ticks
    result <- NLReport(output_name, nl.obj = nl.model)
    return(result)
}

run_replicas

In [8]:
#run_replicas <- function(nofrep,feature_names,feature_values) {
run_replicas <- function(nofrep,feature_values) {
    # Run `nofrep` independent replications of the ABM at one design point
    # and return the mean output (averages out simulation stochasticity).
    #
    # nofrep         : number of replications; seq_len() guards nofrep = 0
    #                  (the original 1:nofrep would iterate c(1, 0)).
    # feature_values : parameter vector forwarded to run_model().

    # Preallocated numeric vector; the original 1 x nofrep matrix added
    # nothing, mean() treats both identically.
    replicas = numeric(nofrep)
    for (i in seq_len(nofrep)) {
        replicas[i] = run_model(feature_values)
    }
    aggregated_result = mean(replicas)
    return(aggregated_result)
}

run_ABM

In [9]:
#run_ABM = function(nofrep,nofinstances,unlabeledset,featurenames = feature_names){
run_ABM = function(nofrep,nofinstances,unlabeledset){
    # Label a batch of design points by running the ABM on each row.
    #
    # nofrep       : replications per design point (passed to run_replicas).
    # nofinstances : number of rows of `unlabeledset` to label; seq_len()
    #                guards nofinstances = 0 (1:nofinstances would not).
    # unlabeledset : data.table containing at least the `feature_names` columns.
    # Returns the data.table with an added `output` column.
    #
    # NOTE: data.table semantics -- setcolorder() and `:=` modify
    # `unlabeledset` by reference, so the caller's table is mutated too.

    # Ensure column order matches the order run_model() assigns parameters.
    unlabeledset = setcolorder(unlabeledset,feature_names)
    for (i in seq_len(nofinstances)) {
        unlabeledset[i, output :=  run_replicas(nofrep, as.matrix(unlabeledset[i,]))]
    }
    return(unlabeledset)
}

error functions

In [10]:
# Error functions evaluated on test data.

# Root-mean-square error between actual and predicted vectors.
rmse_func <- function(actual, predicted){
    squared_errors <- (predicted - actual)^2
    sqrt(mean(squared_errors))
}

# Elementwise absolute percentage error (in percent).
# NOTE: yields Inf/NaN when `actual` contains zeros.
mape_func <- function(actual,predicted){
    abs_err <- abs(actual - predicted)
    (abs_err / actual) * 100
}

# Elementwise relative bias: positive when the prediction undershoots.
bias_func <- function(actual,predicted){
    relative_error <- (actual - predicted) / actual
    relative_error
}

# Error function evaluated on training data (out-of-bag).
obb_error_func <- function(model){
    # Extract the out-of-bag (OOB) error from a fitted randomForest model.
    # (The "obb" spelling is kept because callers use this name.)
    #
    # regression     -> final OOB MSE, i.e. model$mse after the last tree.
    # classification -> the full err.rate matrix (one row per tree).
    #
    # Fix: the original fell through silently for any other model type and
    # then failed with "object 'oob_error' not found"; fail loudly instead.
    if(model$type == "regression"){
        oob_error = model$mse[model$ntree]
    }else if(model$type == "classification"){
        oob_error = model$err.rate
    }else{
        stop("obb_error_func: unsupported model type: ", model$type, call. = FALSE)
    }
    return(oob_error)
}

get_test_predictions

In [11]:
# prediction functions
get_test_predictions <- function(model,testset,errortype){
    
    predictedLabels <- predict(model, testset)
    predictedLabels <- cbind(testset,predictedLabels)
    setnames(predictedLabels, "predictedLabels","pred_output")

    output_variables = colnames(select(predictedLabels, contains("output")))
    # output_variables[1] = true output
    # output_variables[2] = predicted output
    
    #output_variables = colnames(predictedLabels[,1:(ncol(predictedLabels) - 2)])
    
    if(error_type == "MAPE"){
        predictedLabels[,MAPE := mapply(function(x,y) mape_func(x,y),get(output_variables[1]),get(output_variables[2]))]
          }
    if(error_type == "RMSE"){
        predictedLabels[,RMSE := mapply(function(x,y) rmse_func(x,y),get(output_variables[1]),get(output_variables[2]))]
          }
    if(error_type == "BIAS"){
        predictedLabels[,BIAS := mapply(function(x,y) bias_func(x,y),get(output_variables[1]),get(output_variables[2]))]
           } 
                                  
     output_variables_1 = predictedLabels[,get(output_variables[1]), with = TRUE]
     output_variables_2 = predictedLabels[,get(output_variables[2]), with = TRUE]
    
     performance_temp = matrix(c(1:3), nrow = 1, ncol = 3)
     performance_temp[1] =  mae(output_variables_1 , output_variables_2)
     performance_temp[2] = rmse(output_variables_1 , output_variables_2)
     performance_temp[3] = mape(output_variables_1 , output_variables_2)
    
    return(list(predictedLabels,performance_temp,output_variables))
    
}

sample_selection

In [12]:
# Adaptive sample selection function with an uncertainty measure depending on "selection_metric"
# Picks the `selected_ins` rows of `unlabeled_set` whose predictions disagree
# most across the individual trees of the random forest `model`.
# Relies on the global `selection_metric` ("sd" or "range").
# NOTE(review): adds an `idx` column to `unlabeled_set` by reference.
sample_selection <- function(selected_ins,unlabeled_set,model){   
    # predict.all gives one prediction per tree; after t(), rows = trees and
    # columns = unlabeled instances.
    ind_pred <- t(predict(model, unlabeled_set,predict.all = TRUE)$individual) %>% 
                data.table() # predictions by each tree in the forest
    ind_pred_eval = data.table()
    
    # standard deviation calculation (spread of per-tree predictions per instance)
    s_dev = sapply(ind_pred, sd) %>% data.table()
    setnames(s_dev,".","sd")
    ind_pred_eval = cbind(ind_pred_eval,s_dev)
    
    # range calculation: per-instance min/max of tree predictions, then |min - max|.
    # NOTE(review): the local name `range` shadows base::range from here on.
    range = sapply(ind_pred, range) %>% t() %>% data.table()
    range = range[,.(range = abs(range[,1] - range[,2]))]
    setnames(range,"range.V1","range")
    ind_pred_eval = cbind(ind_pred_eval,range)
    
    ind_pred_eval[,idx := 1:.N]
    
    # Keep the `selected_ins` most uncertain instances under the chosen metric.
    if(selection_metric == "sd") {
      ind_pred_eval = ind_pred_eval[order(-sd)][1:selected_ins]
    }else if(selection_metric == "range"){
      ind_pred_eval = ind_pred_eval[order(-range)][1:selected_ins]
    }
    
    unlabeled_set[,idx := 1:.N]    
    train_candidates = unlabeled_set[ind_pred_eval$idx]
    
    return(train_candidates)
}

random_sample_selection

In [13]:
# Random sample selection: draw `selected_ins` rows uniformly at random from
# the unlabeled pool (the baseline against the adaptive strategy).
# NOTE: adds an `idx` column to `unlabeled_set` by reference, mirroring
# sample_selection().
random_sample_selection <- function(selected_ins,unlabeled_set){
    unlabeled_set[,idx := 1:.N]

    chosen_idx = sample(unlabeled_set$idx, selected_ins, replace = FALSE, prob = NULL)
    chosen_rows = unlabeled_set[idx %in% chosen_idx]

    return(chosen_rows)
}

get_variable_importance

In [14]:
get_variable_importance <- function(model){
    # Rank the global `feature_names` by permutation importance, most
    # important first.
    # type = 1 requests the permutation measure (mean decrease in accuracy /
    # %IncMSE); scale = FALSE keeps raw, unstandardized values.
    # Fix: dropped the unused local `ordered.importances`.
    importances <- importance(model, type = 1, scale = FALSE)
    selected.vars <- order(importances, decreasing = TRUE)
    ranked_features = feature_names[selected.vars]

    return(ranked_features)
}

feature_elimination

In [15]:
feature_elimination <- function(h,total_numof_eliminated_vars,ranked_features){
    # Eliminate the `h` least-important features from an importance-ranked list.
    #
    # h                           : number of features to drop in this step.
    # total_numof_eliminated_vars : features already dropped in earlier steps.
    # ranked_features             : full feature list, most important first
    #                               (still including previously dropped names).
    # Returns list(kept features, updated elimination count, h,
    #              names eliminated in this step).
    numof_columns_left = length(ranked_features) - (total_numof_eliminated_vars + h)
    columns_left = ranked_features[1:numof_columns_left]

    # BUG FIX: the original computed setdiff() of two scalars, which yields a
    # single index -- so only ONE eliminated feature name was ever returned,
    # even when h > 1. Use the full index range of this step's block instead.
    # (Identical to the old behavior when h == 1.)
    eliminated_idx = (numof_columns_left + 1):(length(ranked_features) - total_numof_eliminated_vars)
    eliminated_columns = ranked_features[eliminated_idx]

    # update total_numof_eliminated_vars
    total_numof_eliminated_vars = length(ranked_features) - length(columns_left)

    return(list(columns_left,total_numof_eliminated_vars,h,eliminated_columns))
 }

Generate Unlabeled Data Pool

Latin hyper cube sampling

In [16]:
# Build (or load) the pool of unlabeled candidate design points via maximin
# Latin hypercube sampling over the 2-D parameter space.
if(GenerateTTData == 1){
    unlabeled_pool = as.data.table(maximinLHS(n = unlabeled_ins, k = nofparams, dup = 5))
    
    # Map the LHS [0,1] samples onto the uniform(10, 90) parameter ranges.
    unlabeled_pool$V1 = qunif(unlabeled_pool$V1, 10, 90) 
    unlabeled_pool$V2 = qunif(unlabeled_pool$V2, 10, 90) 
    setnames(unlabeled_pool, c(paste0("V",1:nofparams)), feature_names)
    
    unlabeled_pool[,idx := 1:.N]
        
    fwrite(unlabeled_pool, paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/unlabeled_pool_",Sys.Date(),".csv"))
}else{
    # Load a cached pool and keep only points strictly inside the design box.
    unlabeled_pool <- fread("C:/Users/paslanpatir/Desktop/TEZ_v2/unlabeled_pool_04122019.csv")   
    unlabeled_pool <- head(unlabeled_pool[`%-similar-wanted` < 90 & `density` < 90],700) 
}
In [17]:
# Visualize the pool in the first two principal components (coverage check).
pca_unlabeled_pool <- princomp(unlabeled_pool[,-c("idx")], cor = TRUE, scores = TRUE)
pca_unlabeled_pool_components <- get_pca_ind(pca_unlabeled_pool)
p_unlabeled_pool <- ggplot(data = data.table(pca_unlabeled_pool_components$coord[,1:2]), aes(x = Dim.1, y = Dim.2)) +
                    geom_point() +
                    labs( title = "") 
p_unlabeled_pool

Generate Test Set

In [18]:
# Build (or load) the test set: uniformly random points labeled by the ABM.
if(GenerateTTData == 1){
    test_set <- head(unlabeled_pool,test_ins)
    
    ################## TODO: could be generalized based on each variable's data type
    test_set$density            = runif(test_ins, 10, 90) 
    test_set$`%-similar-wanted` = runif(test_ins, 10, 90) 
    test_set[,c("idx"):= NULL]
       
    print(paste0("ABM run start time : ",Sys.time()))
    test_set = run_ABM(nofrep,test_ins,test_set) %>% as.data.table()
    print(paste0("ABM run end time : ",Sys.time()))
    
    fwrite(test_set, paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/test_set_",Sys.Date(),".csv"))
}else{
    test_set <- fread("C:/Users/paslanpatir/Desktop/TEZ_v2/test_set_04122019.csv")  
    # below part is only for this .csv: filter the cached file, then subsample
    # `test_ins` rows at random.
    test_set <- head(test_set[`%-similar-wanted` < 90],800) 
    
    test_set[,idx := 1:.N]    
    test_set_idx = sample(test_set$idx, test_ins, replace = FALSE, prob = NULL)   
    test_set = test_set[idx %in% test_set_idx]
    test_set[,idx:= NULL]
}

Timing notes: 10 instances × 10 replications ≈ 1 min; 100 × 10 ≈ 14 min; 900 × 10 ≈ 9 h 16 min (run dated 2019-12-03 07:54:10 +03).

In [19]:
# PCA scatter of the test set (sanity check of its coverage of the space).
pca_test_set <- princomp(test_set, cor = TRUE, scores = TRUE)
pca_test_set_components <- get_pca_ind(pca_test_set)
p_test_set <- ggplot(data = data.table(pca_test_set_components$coord[,1:2]), aes(x = Dim.1, y = Dim.2)) +
                    geom_point() +
                    labs( title = "") 
p_test_set

Benchmark : One-shot sampling, No feature elimination

Generate Training Set

Select a very big data pool (nofinstances should be very high), e.g. 1000.

In [20]:
# One-shot benchmark: build (or load) a single large LHS training set.
if(GenerateTTData == 1){
    training_set = as.data.table(maximinLHS(n = train_ins_oneshot, k = nofparams, dup = 5))
    
    # Map the LHS [0,1] samples onto the uniform(10, 90) parameter ranges.
    training_set$V1 = qunif(training_set$V1, 10, 90) 
    training_set$V2 = qunif(training_set$V2, 10, 90) 
    setnames(training_set, c(paste0("V",1:nofparams)), feature_names)
    
    training_set$output <- 0.00
    
    print(paste0("ABM run start time : ",Sys.time()))
    # BUG FIX: the original passed the undefined object `LHSample` here, which
    # would error whenever GenerateTTData == 1; label `training_set` itself.
    training_set = run_ABM(nofrep,train_ins_oneshot,training_set) %>% as.data.table()
    print(paste0("ABM run end time : ",Sys.time()))  
    
    fwrite(training_set, paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/training_set_",Sys.Date(),".csv"))
    
}else{
    training_set <- fread("C:/Users/paslanpatir/Desktop/TEZ_v2/LHSample_Data_04122019.csv")
    training_set <- head(training_set[`%-similar-wanted` < 90],700)
}
In [21]:
# Keep a pristine copy of the one-shot training data (training_set is a
# data.table, so copy() prevents later by-reference mutation).
one_shot_data = copy(training_set)

Visualization

In [22]:
# PCA scatter of the training set, colored by the ABM output.
pca_training_set <- princomp(training_set[,.SD, .SDcols = !c("output")], cor = TRUE, scores = TRUE)

pca_training_set_components <- get_pca_ind(pca_training_set)
pca_training_set_components <-cbind(pca_training_set_components$coord[,1:2],training_set[,.SD, .SDcols = c("output")])
p_training_set <- ggplot(data = pca_training_set_components, aes(x = Dim.1, y = Dim.2)) +
             geom_point(aes(colour = output)) +
             labs( title = "", legend = "output") 
p_training_set

Train & Test Metamodel

In [23]:
# Fit the one-shot random-forest metamodel on the full training set
# (ntree/mtry come from the globals set above).
model_oneshot <- randomForest(x = training_set[, -c("output")], y = training_set$output, importance = TRUE,ntree = ntree, mtry = mtry)
model_oneshot
Call:
 randomForest(x = training_set[, -c("output")], y = training_set$output,      ntree = ntree, mtry = mtry, importance = TRUE) 
               Type of random forest: regression
                     Number of trees: 400
No. of variables tried at each split: 2

          Mean of squared residuals: 9.100652
                    % Var explained: 96.98
In [24]:
# Out-of-bag error of the one-shot model (final OOB MSE for regression).
obb_error_oneshot <- obb_error_func(model_oneshot)
In [ ]:
#OBB_pred = cbind(training_set$output,model_oneshot$predicted)
#names(OBB_pred) <- c("actual","predicted")
In [25]:
# OOB MSE as a function of the number of trees (convergence check).
plot(model_oneshot$mse, type="l")
In [26]:
# Evaluate the one-shot model on the held-out test set.
test_prediction_oneshot = get_test_predictions(model_oneshot,test_set,error_type)
predictedLabels_oneshot = test_prediction_oneshot[[1]]

performance_table_oneshot = data.table(iter = numeric(), mae= numeric(),rmse= numeric(), mape = numeric())
# Keep test set error records
performance_table_oneshot = rbind(performance_table_oneshot, data.table(1, test_prediction_oneshot[[2]]), use.names = FALSE)

output_variables = test_prediction_oneshot[[3]]
In [27]:
# Display the benchmark results.
performance_table_oneshot
obb_error_oneshot
head(predictedLabels_oneshot)
A data.table: 1 × 4
itermaermsemape
<dbl><dbl><dbl><dbl>
10.89524352.4116261.248689
9.10065150263321
A data.table: 6 × 5
density%-similar-wantedoutputpred_outputRMSE
<dbl><dbl><dbl><dbl><dbl>
89.3902663.8771198.2515797.757130.494433300
39.2718236.4603986.9769386.347740.629181238
30.1745557.2362998.3517998.459030.107241895
28.1221068.0556499.8650499.862650.002391377
80.0970142.1845582.7019183.975431.273523215
67.6895515.8121254.6247354.364980.259752578
In [28]:
# Actual vs fitted scatter for the one-shot model; color encodes the signed
# residual (predicted - actual), the diagonal marks perfect predictions.
p_oneshot <- ggplot(predictedLabels_oneshot,aes(x = get(output_variables[1]), y = get(output_variables[2]), color = (get(output_variables[2]) - get(output_variables[1])))) +
            geom_point() +
            geom_abline() +
            xlab("actual values") +
            ylab("fitted values")

p_oneshot

Random Sampling & No Feature Elimination

Generate Training Set

Select a relatively big data pool (nofinstances should be medium), e.g. 400.

In [29]:
# Initial training set for the sampling strategies (random & adaptive):
# a small LHS design labeled by the ABM, or loaded from cache.
if(GenerateTTData == 1){
   
    training_set_Ad = as.data.table(maximinLHS(n = train_ins_Ad, k = nofparams, dup = 5))
    
    # Map the LHS [0,1] samples onto the uniform(10, 90) parameter ranges.
    training_set_Ad$V1 = qunif(training_set_Ad$V1, 10, 90) 
    training_set_Ad$V2 = qunif(training_set_Ad$V2, 10, 90) 
    setnames(training_set_Ad, c(paste0("V",1:nofparams)), feature_names)
    training_set_Ad$output <- 0.00
    
    print(paste0("ABM run start time : ",Sys.time()))
    training_set_Ad = run_ABM(nofrep,train_ins_Ad,training_set_Ad) %>% as.data.table()
    print(paste0("ABM run end time : ",Sys.time()))
    
    fwrite(training_set_Ad, paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/LHSample_Ad_Data",Sys.Date(),".csv"))

}else{
    training_set_Ad <- fread("C:/Users/paslanpatir/Desktop/TEZ_v2/LHSample_Ad_Data_04122019.csv")
    training_set_Ad <- head(training_set_Ad[`%-similar-wanted` < 90  & `density` < 90],200)

}
In [30]:
# Pristine copy so the adaptive strategy can restart from the exact same
# initial design later (fair comparison with random sampling).
adaptive_initial_data = copy(training_set_Ad)

Train & Test Metamodel

In [31]:
# Decide on strategy:
#iteration_budget = 3   #specified above

## initialize record tables: record train candidates selected per iteration
train_candidates_table = data.table()

# Record model performances
performance_table = data.table(iter = numeric(), mae = numeric(), rmse = numeric(), mape = numeric())

# Record obb_error table
obb_error = data.table(iter = numeric(), obb_error = numeric())

## initialize variables
# keep test set undistorted
predictedLabels_table = copy(test_set)
In [32]:
# --- Random-sampling active-learning loop ---
# Each iteration: fit an RF on the current training data, record OOB and
# test-set errors, then label `selected_ins` randomly chosen pool points and
# append them to the training set.
# NOTE(review): shrinks the global `unlabeled_pool` each iteration.
print(paste0("section start time : ",Sys.time()))
iter = 1
while(iter <= iteration_budget){   
    print(iter)

    trainx = training_set_Ad[,.SD, .SDcols = feature_names]
    trainy = training_set_Ad$output
    
    # Train the model
    model_Sub <- randomForest( x = trainx, y =  trainy,importance = TRUE,ntree = ntree, mtry = mtry)
    # keep each iteration's model as model_Sub_<iter> for later inspection
    assign(paste0("model_Sub_",iter),model_Sub)
                     
    obb_error = rbind(obb_error,data.table(iter,obb_error_func(model_Sub)),use.names=FALSE)
    
    # test the model on test set
    test_predictions_Sub = get_test_predictions(model_Sub,test_set,error_type)
    predictedLabels_Sub = test_predictions_Sub[[1]]
    # suffix the per-iteration prediction/error columns with the iteration number
    setnames(predictedLabels_Sub,c("pred_output",error_type), c(paste0("pred_output_",iter),paste0(error_type,"_",iter)))    
    predictedLabels_table = cbind(predictedLabels_table,predictedLabels_Sub[,.SD, .SDcols = c(paste0("pred_output_",iter),paste0(error_type,"_",iter))])
    
    # Keep test set error records
    performance_table = rbind(performance_table,data.table(iter,test_predictions_Sub[[2]]), use.names = FALSE)    

    if(iter != iteration_budget){ # below efforts are unnecessary when the budget is reached.
        
    ## sample selection from unlabeled data select candidates
    unlabeled_set <- copy(unlabeled_pool)
    train_candidates = random_sample_selection(selected_ins,unlabeled_set)
        
    # Eliminate train candidates from the unlabeled pool
    unlabeled_pool = unlabeled_pool[- train_candidates$idx]
    rm(unlabeled_set)
    
    # run ABM to find outputs of train candidates
    print(paste0("ABM train_candidate run start time : ",Sys.time()))
    train_candidates = run_ABM(nofrep,selected_ins,train_candidates)
    print(paste0("ABM train_candidate run end time : ",Sys.time()))
    
    train_candidates_table = rbind(train_candidates_table, data.table(train_candidates,iter = iter))

    # Add new data to train data
    training_set_Ad = rbind(training_set_Ad,train_candidates[,-c("idx")])
    }
    iter = iter + 1
}

# TODO: add a plot to display progress at each iteration.
#setcolorder(data,variableorder) ################# NOTE: may need to place this somewhere -- be careful!!
print(paste0("section end time : ",Sys.time()))
[1] 1
[1] "ABM train_candidate run start time : 2020-01-08 20:13:23"
[1] "ABM train_candidate run end time : 2020-01-08 20:14:18"
[1] 2
[1] "ABM train_candidate run start time : 2020-01-08 20:14:19"
[1] "ABM train_candidate run end time : 2020-01-08 20:15:37"
[1] 3
[1] "ABM train_candidate run start time : 2020-01-08 20:15:38"
[1] "ABM train_candidate run end time : 2020-01-08 20:16:10"
[1] 4
[1] "ABM train_candidate run start time : 2020-01-08 20:16:10"
[1] "ABM train_candidate run end time : 2020-01-08 20:16:55"
[1] 5
[1] "ABM train_candidate run start time : 2020-01-08 20:16:55"
[1] "ABM train_candidate run end time : 2020-01-08 20:18:44"
[1] 6
[1] "ABM train_candidate run start time : 2020-01-08 20:18:44"
[1] "ABM train_candidate run end time : 2020-01-08 20:20:43"
[1] 7
[1] "ABM train_candidate run start time : 2020-01-08 20:20:43"
[1] "ABM train_candidate run end time : 2020-01-08 20:21:47"
[1] 8
[1] "ABM train_candidate run start time : 2020-01-08 20:21:47"
[1] "ABM train_candidate run end time : 2020-01-08 20:22:19"
[1] 9
[1] "ABM train_candidate run start time : 2020-01-08 20:22:19"
[1] "ABM train_candidate run end time : 2020-01-08 20:22:46"
[1] 10
[1] "ABM train_candidate run start time : 2020-01-08 20:22:46"
[1] "ABM train_candidate run end time : 2020-01-08 20:23:49"
[1] 11
[1] "ABM train_candidate run start time : 2020-01-08 20:23:49"
[1] "ABM train_candidate run end time : 2020-01-08 20:24:13"
[1] 12
[1] "ABM train_candidate run start time : 2020-01-08 20:24:14"
[1] "ABM train_candidate run end time : 2020-01-08 20:25:20"
[1] 13
[1] "ABM train_candidate run start time : 2020-01-08 20:25:20"
[1] "ABM train_candidate run end time : 2020-01-08 20:26:33"
[1] 14
[1] "ABM train_candidate run start time : 2020-01-08 20:26:33"
[1] "ABM train_candidate run end time : 2020-01-08 20:26:55"
[1] 15
[1] "ABM train_candidate run start time : 2020-01-08 20:26:55"
[1] "ABM train_candidate run end time : 2020-01-08 20:27:58"
[1] 16
[1] "ABM train_candidate run start time : 2020-01-08 20:27:59"
[1] "ABM train_candidate run end time : 2020-01-08 20:28:21"
[1] 17
[1] "ABM train_candidate run start time : 2020-01-08 20:28:21"
[1] "ABM train_candidate run end time : 2020-01-08 20:29:28"
[1] 18
[1] "ABM train_candidate run start time : 2020-01-08 20:29:28"
[1] "ABM train_candidate run end time : 2020-01-08 20:30:46"
[1] 19
[1] "ABM train_candidate run start time : 2020-01-08 20:30:46"
[1] "ABM train_candidate run end time : 2020-01-08 20:31:12"
[1] 20

Started 2020-01-08 20:13:23, ended 2020-01-08 20:31:12 — 10 replications × 5 samples × 19 selection iterations = 950 ABM runs.

In [33]:
# Final records
# Snapshot the random-sampling (Rd) results before the bookkeeping tables
# are re-initialized for the adaptive run.
FinalTrainData_Rd = copy(training_set_Ad)
performance_table_Rd = copy(performance_table)
train_candidates_table_Rd  = copy(train_candidates_table)
predictedLabels_table_Rd = copy(predictedLabels_table)
obb_error_Rd = copy(obb_error)
In [35]:
#fwrite(FinalTrainData_Rd,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/FinalTrainData_Rd_BasicCode_",Sys.Date(),".csv") )
#fwrite(performance_table_Rd,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/performance_table_Rd_BasicCode_",Sys.Date(),".csv") )
#fwrite(train_candidates_table_Rd,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/train_candidates_table_Rd_BasicCode_",Sys.Date(),".csv") )
#fwrite(predictedLabels_table_Rd,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/predictedLabels_table_Rdd_BasicCode_",Sys.Date(),".csv") )
#fwrite(obb_error_Rd,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/obb_error_Rd_BasicCode_",Sys.Date(),".csv") )
In [34]:
# show results
nrow(FinalTrainData_Rd)
performance_table_Rd 
train_candidates_table_Rd  
head(predictedLabels_table_Rd)
obb_error_Rd
295
A data.table: 20 × 4
itermaermsemape
<dbl><dbl><dbl><dbl>
11.9790574.6309512.892140
21.9561424.5664502.842396
31.9310754.5514892.807515
41.8800984.5091832.745856
51.8189184.4429632.652534
61.8402654.4791812.686348
71.8151434.3844542.655121
81.7652954.3401632.572454
91.8395894.4654412.688164
101.7778554.3691822.591287
111.8082104.4061842.630731
121.7930104.3842072.611763
131.7708574.3900752.595782
141.7086684.3066372.489076
151.7080684.2775942.489050
161.6537904.2463182.408060
171.6850954.2775752.459884
181.6675274.2698242.432956
191.6643384.2797992.434972
201.6423674.2344452.402899
A data.table: 95 × 5
density%-similar-wantedidxoutputiter
<dbl><dbl><int><dbl><dbl>
42.7781253.7596827497.114031
78.7184222.1186130560.224351
34.3545959.9310031997.943751
71.0139610.8369966152.595281
46.8770483.2746569053.828231
84.8393749.62663 2386.995132
61.3530375.3873717358.263522
42.2483144.0891520190.508092
12.5689424.5521236887.043682
81.4658868.2242361199.535282
33.2758949.22701 6491.407083
24.2790633.6883032391.819923
27.3163610.9168038169.277643
46.0782453.1911554096.561193
51.5434634.7011368382.553833
34.6003572.34013 5899.834844
47.2192269.72505 7999.643414
16.2287754.7171112999.511344
78.9714325.3704322764.334654
77.9436532.2209551672.501344
14.5120952.86507 399.693225
13.6979016.2113024985.043995
42.0601012.3207648359.112185
56.6587642.5764652086.452165
85.8448488.2098952750.103145
83.0640879.4220611652.400236
41.8380778.6731016779.521316
76.9274612.8828828652.242386
18.7972379.6772645494.039606
86.4760554.0109364394.019356
...............
15.1394474.06980 3100.0000014
17.8470321.11220 29 79.3908314
75.0797952.73906 44 93.9868214
21.6072685.95121378 75.2873314
38.5599411.12222614 61.5464814
52.3482346.21328 31 88.9077315
56.3512212.24678251 54.6864915
63.2360284.93076338 51.4673415
15.3217225.76877428 88.2410715
49.3455469.43317625 99.7261715
28.9798835.44827144 90.0233816
32.0867213.03629303 65.3974716
37.8216727.35264432 76.8170216
20.9237158.18562494 99.2303616
25.8648651.16387546 98.8403116
37.2722818.42808 91 63.2099017
20.9848480.97271160 80.7359617
45.3875135.70500249 84.1898217
88.8034441.15310467 82.6283117
38.8083766.89394588 99.7924917
41.9704955.20956 3 97.1812318
36.3755837.53732136 86.6314118
26.1583427.99602176 80.0068118
64.0423569.77663383 99.6475118
73.4716682.75609483 52.1500318
34.2486864.31899 57 98.9152919
56.8826964.81723384 98.5185719
53.5001531.99233394 73.9148319
34.8906514.04983484 63.4055119
30.2952248.18946576 91.7866619
A data.table: 6 × 43
density%-similar-wantedoutputpred_output_1RMSE_1pred_output_2RMSE_2pred_output_3RMSE_3pred_output_4...pred_output_16RMSE_16pred_output_17RMSE_17pred_output_18RMSE_18pred_output_19RMSE_19pred_output_20RMSE_20
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>...<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
89.3902663.8771198.2515798.064020.18754528298.081320.17024439698.087470.1640983498.06799...98.014150.2374183998.018410.23315600798.042510.20905315797.998760.25280669998.008400.24316723
39.2718236.4603986.9769384.971062.00586921484.900122.07680543985.100501.8764277285.64263...86.085050.8918729486.560310.41661298087.007840.03091906686.629330.34759674486.627360.34956716
30.1745557.2362998.3517998.485190.13339482798.447110.09531249798.507910.1561205298.41732...98.478990.1271982598.496180.14438939798.530980.17919123198.447220.09542577698.511250.15946007
28.1221068.0556499.8650499.867350.00230282199.874640.00959901399.876500.0114550799.86329...99.868140.0030961299.859510.00552937499.867850.00280286699.867400.00235386299.878380.01333793
80.0970142.1845582.7019185.346282.64437026185.218362.51644576385.307942.6060253385.42648...85.898053.1961422585.752233.05032358685.507402.80549333085.671412.96950180585.609292.90737641
67.6895515.8121254.6247357.182312.55757802857.207842.58310736256.901282.2765480956.99056...56.625372.0006398856.297901.67316359756.390571.76583782256.536871.91214045156.428421.80369009
A data.table: 20 × 2
iterobb_error
<dbl><dbl>
113.698624
212.575650
311.546581
410.946875
510.486905
6 9.954476
7 9.179074
8 8.386098
9 8.878509
10 8.073575
11 8.115914
12 7.367993
13 7.798472
14 6.945190
15 7.121243
16 6.084151
17 6.817448
18 5.652835
19 6.150030
20 5.816591
In [36]:
# Reshape the per-iteration metrics (mae/rmse/mape) to long format and plot
# one error curve per metric for the random-sampling run.
performance_molten_Rd <- melt(data = performance_table_Rd
                             , id.vars = 'iter')
setnames(performance_molten_Rd, c("variable","value"),c("errortype","errorvalue"))
p_Rd = ggplot(performance_molten_Rd, aes(x = iter, y = errorvalue, group=errortype, col=errortype)) + 
          geom_line(lwd=1)
p_Rd

Adaptive Sampling & No Feature Elimination

Generate Training Set

Select a relatively big data pool (nofinstances should be medium), e.g. 400.

In [37]:
# Restart the adaptive strategy from the same initial design used by the
# random-sampling run, for a fair comparison.
training_set_Ad = copy(adaptive_initial_data)
In [ ]:
#if(GenerateTTData == 1){
#   
#    LHSample_Ad = as.data.table(maximinLHS(n = train_ins_Ad, k = nofparams, dup = 5))
#    
#    LHSample_Ad$V1 = qunif(LHSample_Ad$V1, 10, 90) 
#    LHSample_Ad$V2 = qunif(LHSample_Ad$V2, 10, 90) 
#    setnames(LHSample_Ad, c("V1","V2"), feature_names)
#    LHSample_Ad$output <- 0.00
#    
#    paste0("ABM run start time : ",Sys.time())
#    LHSample_Ad = run_ABM(nofrep,train_ins_Ad,LHSample_Ad) %>% as.data.table()
#    paste0("ABM run end time : ",Sys.time())
#    
#    fwrite(LHSample_Ad, paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/LHSample_Ad_Data",Sys.Date(),".csv"))
#
#}else{
#    LHSample_Ad <- fread("C:/Users/paslanpatir/Desktop/TEZ_v2/LHSample_Ad_Data_04122019.csv")
#    LHSample_Ad <- head(LHSample_Ad[`%-similar-wanted` < 90  & `density` < 90],200)
#
#}

Visualization

In [38]:
# PCA of the adaptive initial training set (features only).
pca_training_set_Ad <- princomp(training_set_Ad[,-c("output")], cor = TRUE, scores = TRUE)
In [39]:
#fviz_pca_ind(pca_LHSample,
#             col.ind = "cos2", # Color by the quality of representation
#             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
#              geom="point"
#             )

# PCA scatter of the adaptive initial training set, colored by the ABM output.
pca_training_set_Ad_components <- get_pca_ind(pca_training_set_Ad)
pca_training_set_Ad_components <-cbind(pca_training_set_Ad_components$coord[,1:2],training_set_Ad[,c("output")])
p_training_set_Ad <- ggplot(data = pca_training_set_Ad_components, aes(x = Dim.1, y = Dim.2)) +
                     geom_point(aes(colour = output)) +
                     labs( title = "", legend = "output") 
p_training_set_Ad

Train & Test Metamodel

In [40]:
# Decide on strategy:
#iteration_budget = 3

#h = 1 # specify how many variable will be eliminated in each elimination iteration
In [41]:
# Reset the bookkeeping tables for the adaptive-sampling run (mirrors the
# initialization done before the random-sampling loop).
## initialize record tables Record train candidates
train_candidates_table = data.table()

# Record model performances
performance_table = data.table(iter = numeric(), mae = numeric(), rmse = numeric(), mape = numeric())

# Record obb_error table
obb_error = data.table(iter = numeric(), obb_error = numeric())

## initialize variables
# keep test set undistorted
predictedLabels_table = copy(test_set)
In [42]:
# --- Adaptive (uncertainty-based) active-learning loop ---
# Identical to the random-sampling loop above, except candidates are chosen
# by sample_selection(), i.e. the pool points with the highest per-tree
# prediction uncertainty under the current model.
print(paste0("section start time : ",Sys.time()))
iter = 1
while(iter <= iteration_budget){   
    print(iter)

    trainx = training_set_Ad[,.SD, .SDcols = feature_names]
    trainy = training_set_Ad$output
    
    # Train the model
    model_Sub <- randomForest( x = trainx, y =  trainy,importance = TRUE,ntree = ntree, mtry = mtry)
    # keep each iteration's model as model_Sub_<iter> for later inspection
    assign(paste0("model_Sub_",iter),model_Sub)
                    
    obb_error = rbind(obb_error,data.table(iter,obb_error_func(model_Sub)),use.names=FALSE)

    # test the model on test set
    test_predictions_Sub = get_test_predictions(model_Sub,test_set,error_type)
    predictedLabels_Sub = test_predictions_Sub[[1]]
    # suffix the per-iteration prediction/error columns with the iteration number
    setnames(predictedLabels_Sub,c("pred_output",error_type), c(paste0("pred_output_",iter),paste0(error_type,"_",iter)))    
    predictedLabels_table = cbind(predictedLabels_table,predictedLabels_Sub[,.SD, .SDcols = c(paste0("pred_output_",iter),paste0(error_type,"_",iter))])
    
    # Keep test set error records
    performance_table = rbind(performance_table,data.table(iter,test_predictions_Sub[[2]]), use.names = FALSE)
    
    if(iter != iteration_budget){ # below efforts are unnecessary when the budget is reached.    
    ## sample selection from unlabeled data select candidates
        unlabeled_set <- copy(unlabeled_pool)
        train_candidates = sample_selection(selected_ins, unlabeled_set, model_Sub)
        
        # eliminate candidates from the unlabeled pool
        unlabeled_pool = unlabeled_pool[-train_candidates$idx]
        rm(unlabeled_set)
        
        # run ABM to find outputs of train candidates
        print(paste0("ABM train_candidate run start time : ",Sys.time()))
        train_candidates = run_ABM(nofrep, selected_ins, train_candidates)
        print(paste0("ABM train_candidate run end time : ",Sys.time()))
        
        train_candidates_table = rbind(train_candidates_table, data.table(train_candidates,iter = iter))
        
        # add labeled candidates to the train data
        training_set_Ad = rbind(training_set_Ad, train_candidates[, -c("idx")])
    }
    iter = iter + 1
}
print(paste0("section end time : ",Sys.time()))
[1] "2020-01-08 20:43:07 +03"
[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
[1] 6
[1] 7
[1] 8
[1] 9
[1] 10
[1] 11
[1] 12
[1] 13
[1] 14
[1] 15
[1] 16
[1] 17
[1] 18
[1] 19
[1] 20
In [43]:
# Final records
FinalTrainData_Ad = copy(training_set_Ad)
performance_table_Ad = copy(performance_table)
train_candidates_table_Ad  = copy(train_candidates_table)
predictedLabels_table_Ad = copy(predictedLabels_table)
obb_error_Ad = copy(obb_error)
In [44]:
#fwrite(FinalTrainData_Ad,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/FinalTrainData_Ad_BasicCode_",Sys.Date(),".csv") )
#fwrite(performance_table_Ad,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/performance_table_Ad_BasicCode_",Sys.Date(),".csv") )
#fwrite(train_candidates_table_Ad,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/train_candidates_table_Ad_BasicCode_",Sys.Date(),".csv") )
#fwrite(predictedLabels_table_Ad,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/predictedLabels_table_Ad_BasicCode_",Sys.Date(),".csv") )
#fwrite(obb_error_Ad,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/obb_error_Ad_BasicCode_",Sys.Date(),".csv") )
In [45]:
# Inspect the adaptive-sampling results: final training-set size, per-iteration
# test errors, the candidates picked each iteration, per-instance test
# predictions, and the out-of-bag error trajectory.
nrow(FinalTrainData_Ad)
performance_table_Ad
train_candidates_table_Ad
head(predictedLabels_table_Ad)
obb_error_Ad
295
A data.table: 20 × 4
itermaermsemape
<dbl><dbl><dbl><dbl>
11.9531074.5507092.851815
21.8752824.4664532.785134
31.7262524.2105742.507212
41.7035104.1837902.467414
51.7362354.2066432.520176
61.6599614.0866852.431520
71.6289214.0646372.388690
81.5607163.9036072.298264
91.5642093.9180142.308238
101.5507333.9219762.281406
111.5086003.8880822.221079
121.4945173.8607782.194536
131.5294493.8834192.248281
141.4809853.8405142.181458
151.4786013.8788812.183937
161.4553493.8616962.151844
171.4149073.8482122.096413
181.3752673.8208532.036647
191.3954943.8323822.070172
201.3874283.8225722.063110
A data.table: 95 × 5
density%-similar-wantedidxoutputiter
<dbl><dbl><int><dbl><dbl>
52.5150274.9953511699.747721
66.3243474.8561316899.753181
79.1863874.5911444481.755181
38.0483974.8949723799.810411
17.7986875.8354121098.806361
49.5289675.20317 1268.802202
57.4877475.7097827260.828922
63.6988276.0875023159.070342
54.7581076.5828726264.390712
70.4786176.76088 4955.411882
22.2315689.1719447471.969503
23.9634387.89434 6770.150553
23.1629282.2857539771.027023
25.2338289.5988229663.582993
25.5551082.8926911966.410173
18.6606886.56811 3388.446464
20.3701275.1407844394.162834
32.1509676.2694838882.252914
41.2925475.9178053473.042684
19.7934883.2254855184.459964
75.2477475.3334151753.865215
33.8388579.3084615382.540225
14.1575913.4271246184.207435
12.2721812.10815 1687.784435
17.2993910.1668658179.458715
29.0856880.7789314360.627956
17.9556888.8707357992.265856
44.2697579.5329348873.042316
35.4304277.5368453280.362476
29.2614878.0199842283.833296
...............
89.3762867.27177180 98.1181514
89.0176256.11732338 94.6440514
22.6192914.09972175 73.3314714
89.6756552.96523311 94.1110114
28.5884783.86143374 62.1055414
22.8865225.02460 37 81.9591815
75.6221428.38348275 65.3826115
63.8265179.28610178 58.2794315
67.3908272.71155109 99.6498015
22.5290127.04430304 82.0277815
25.3066225.56910457 80.6532816
87.2756128.73142339 73.6473516
33.5934332.95049262 78.9835016
37.3896733.93330 20 86.7338116
40.0936913.92735139 60.9759916
28.0811525.41423215 80.0030517
31.7605126.41215372 78.8019417
77.5987877.15022264 53.1650317
35.0228027.87709192 77.9415117
49.5054989.36927220 51.8924417
26.0784577.40225233 85.5688818
16.9440985.47096329 98.2174118
43.3866722.75769 10 66.9297718
11.6295688.38036130100.0000018
21.9377016.78390487 76.0364418
40.0538625.70328184 76.0399519
24.9680574.65329430 99.9273719
45.9813781.18073 77 54.4648319
35.3261524.68971511 68.6997819
31.9878223.78817 95 71.0799919
A data.table: 6 × 43
density%-similar-wantedoutputpred_output_1RMSE_1pred_output_2RMSE_2pred_output_3RMSE_3pred_output_4...pred_output_16RMSE_16pred_output_17RMSE_17pred_output_18RMSE_18pred_output_19RMSE_19pred_output_20RMSE_20
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>...<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
89.3902663.8771198.2515797.967000.2845687897.934610.3169552097.891810.3597585997.89960...97.668390.5831736397.919830.3317410597.858280.3932837497.804210.4473546897.760810.49075851
39.2718236.4603986.9769384.813022.1639028984.804842.1720856084.999381.9775489184.73263...86.134670.8422505786.275770.7011521186.352560.6243627686.336600.6403238886.153620.82330789
30.1745557.2362998.3517998.442410.0906183398.468820.1170231698.512460.1606710198.44729...98.464620.1128284798.431490.0797011998.468320.1165304898.395670.0438747998.479000.12720653
28.1221068.0556499.8650499.879470.0144281199.886980.0219358299.889120.0240743499.88278...99.881940.0169006299.882210.0171662599.881180.0161401699.877550.0125093199.877180.01213288
80.0970142.1845582.7019185.216902.5149858685.332702.6307870085.355352.6534351285.21777...85.314772.6128554285.157252.4553357985.233292.5313758885.114452.4125346985.361552.65964143
67.6895515.8121254.6247357.264082.6393466057.239172.6144352557.033672.4089364556.92696...56.686322.0615856356.351831.7271001656.474591.8498628656.436821.8120889556.513621.88888368
A data.table: 20 × 2
iterobb_error
<dbl><dbl>
113.023629
220.357446
312.220124
411.424621
512.597000
612.687486
712.313131
812.657531
912.398615
1011.619570
1112.149426
1210.130318
13 9.809643
14 9.592774
1510.050909
16 9.763462
17 8.740436
18 9.116479
19 8.536941
20 8.507114
In [46]:
# Reshape the adaptive-sampling performance table to long format so each error
# metric (mae / rmse / mape) becomes one colored line in the plot. The column
# renaming is folded directly into melt() via variable.name/value.name instead
# of a separate setnames() call.
performance_molten_Ad <- melt(
    data = performance_table_Ad,
    id.vars = "iter",
    variable.name = "errortype",
    value.name = "errorvalue"
)

# Error trajectories over active-learning iterations.
p_Ad = ggplot(
        performance_molten_Ad,
        aes(x = iter, y = errorvalue, group = errortype, col = errortype)
    ) +
    geom_line(lwd = 1)
p_Ad

Random Sampling vs Uncertainty Sampling

In [47]:
# Side-by-side comparison: random sampling (left) vs uncertainty sampling (right).
grid.arrange(p_Rd, p_Ad, ncol=2)
In [93]:
# Melt the one-shot benchmark performance to long format (same shape as the
# Rd/Ad molten tables) so it can be overlaid as horizontal reference lines.
performance_molten_oneshot <- melt(data = performance_table_oneshot
                             , id.vars = 'iter')
setnames(performance_molten_oneshot, c("variable","value"),c("errortype","errorvalue"))

# Stack random (Rd) and adaptive (Ad) error trajectories and facet by type,
# with the one-shot benchmark shown as dashed horizontal lines per metric.
performance_Rd_vs_Ad = rbind(performance_molten_Rd[,.(iter,errortype,errorvalue, type = "Rd")],performance_molten_Ad[,.(iter,errortype,errorvalue, type = "Ad")])
p_Rd_vs_Ad = ggplot(performance_Rd_vs_Ad, aes(x = iter, y = errorvalue, group=errortype, col=errortype)) + 
            geom_line(lwd=1) +
            # FIX: dropped `stat = "hline"` — that stat was removed from ggplot2
            # (>= 2.0); geom_hline() uses the identity stat internally and the
            # argument is rejected as an unknown parameter.
            geom_hline(data = performance_molten_oneshot, aes(yintercept = errorvalue, group=errortype, col=errortype), linetype = "dashed") +
            facet_wrap(~type)
p_Rd_vs_Ad
In [94]:
ggplotly(p_Rd_vs_Ad)
In [98]:
# Compare the final-iteration (iter 20) RMSE of random (Rd) and adaptive (Ad)
# sampling against the one-shot benchmark, reporting absolute and relative gaps.
comp = performance_Rd_vs_Ad[iter == 20 & errortype == "rmse"]
comp[, oneshot_error := performance_molten_oneshot[errortype == "rmse"]$errorvalue]
comp[, diff := errorvalue - oneshot_error]
# Relative gap reuses the freshly computed diff column.
comp[, diff_perc := diff / oneshot_error]
comp
A data.table: 2 × 7
itererrortypeerrorvaluetypeoneshot_errordiffdiff_perc
<dbl><fct><dbl><chr><dbl><dbl><dbl>
20rmse4.234445Rd2.4116261.8228190.7558461
20rmse3.822572Ad2.4116261.4109450.5850595

Adaptive Sampling & Feature Elimination

Generate Training Set

Select a relatively big data pool ( nofinstances should be medium) , like 400

In [48]:
# Re-seed the training set for the AdFe (adaptive + feature elimination)
# experiment from the initial labeled pool. copy() avoids data.table aliasing.
# NOTE(review): `adaptive_initial_data` is presumably prepared earlier in the
# notebook — confirm it matches the pool used by the plain-Ad run.
training_set_Ad = copy(adaptive_initial_data)
In [ ]:
#if(GenerateTTData == 1){
#   
#    LHSample_Ad = as.data.table(maximinLHS(n = train_ins_Ad, k = nofparams, dup = 5))
#    
#    LHSample_Ad$V1 = qunif(LHSample_Ad$V1, 10, 90) 
#    LHSample_Ad$V2 = qunif(LHSample_Ad$V2, 10, 90) 
#    setnames(LHSample_Ad, c("V1","V2"), feature_names)
#    LHSample_Ad$output <- 0.00
#    
#    paste0("ABM run start time : ",Sys.time())
#    LHSample_Ad = run_ABM(nofrep,train_ins_Ad,LHSample_Ad) %>% as.data.table()
#    paste0("ABM run end time : ",Sys.time())
#    
#    fwrite(LHSample_Ad, paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/LHSample_Ad_Data",Sys.Date(),".csv"))
#
#}else{
#    LHSample_Ad <- fread("C:/Users/paslanpatir/Desktop/TEZ_v2/LHSample_Ad_Data_04122019.csv")
#    LHSample_Ad <- head(LHSample_Ad[`%-similar-wanted` < 90  & `density` < 90],200)
#
#}

Visualization

In [49]:
# PCA (on the correlation matrix, cor = TRUE) of the training inputs, excluding
# the simulated output column; scores are kept for the 2-D projection below.
pca_training_set_Ad <- princomp(training_set_Ad[,-c("output")], cor = TRUE, scores = TRUE)
In [ ]:
#fviz_pca_ind(pca_LHSample,
#             col.ind = "cos2", # Color by the quality of representation
#             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
#              geom="point"
#             )

# Project the training set onto the first two principal components and color
# each point by the ABM output value.
pca_training_set_Ad_components <- get_pca_ind(pca_training_set_Ad)
pca_training_set_Ad_components <-cbind(pca_training_set_Ad_components$coord[,1:2],training_set_Ad[,c("output")])
p_training_set_Ad <- ggplot(data = pca_training_set_Ad_components, aes(x = Dim.1, y = Dim.2)) +
                     geom_point(aes(colour = output)) +
                     labs( title = "", legend = "output") 
p_training_set_Ad

Train and Test Metamodel

In [51]:
# Decide on strategy: sample selection runs on iterations 1-19; feature
# elimination runs once, at iteration 19, so the final iteration (20) only
# retrains and evaluates on the reduced feature set. Both maxima must be
# below iteration_budget.
sample_selection_iteration_order = c(1:19)
feature_elimination_iteration_order = c(19)
#iteration_budget = 3 # should be > max(max(sample_selection_iteration_order),max(feature_elimination_iteration_order))

#h = 1 # specify how many variable will be eliminated in each elimination iteration
In [50]:
feature_names
  1. 'density'
  2. '%-similar-wanted'
In [52]:
## Initialize record tables
# Candidates picked by uncertainty sampling, accumulated across iterations
train_candidates_table = data.table()

# Per-iteration test-set error metrics
performance_table = data.table(iter = numeric(), mae = numeric(), rmse = numeric(), mape = numeric())

# Per-iteration out-of-bag error of the random forest
obb_error = data.table(iter = numeric(), obb_error = numeric())

# Which actions (feature elimination / data selection) ran in each iteration
iteration_history = data.table(iter_no = numeric(), IsFeatureEliminated = logical(), IsDataSelected = logical())

## initialize variables
# keep test set undistorted; per-iteration predictions get cbind-ed onto this copy
predictedLabels_table = copy(test_set)

# columns (features) still in use; starts as the full feature set
columns_left = feature_names
total_numof_eliminated_vars <- 0
In [53]:
print(paste0("section start time : ",Sys.time()))
iter = 1
# Active-learning loop: each iteration trains a random-forest metamodel,
# records OOB and test errors, and (until the budget is reached) optionally
# selects new training samples and/or eliminates features.
while (iter <= iteration_budget) {
    
    # Restrict training inputs to the features that survived elimination so far.
    trainx = training_set_Ad[, .SD, .SDcols = columns_left]
    trainy = training_set_Ad$output
    
    # Train the metamodel.
    # FIX: clamp mtry to the number of remaining features. After feature
    # elimination fewer predictors than the global `mtry` (= 2) may remain;
    # randomForest then only emits an "invalid mtry" warning (hidden here by
    # options(warn = -1)) before resetting it internally. Clamping makes the
    # intent explicit without changing the fitted model.
    model_Sub <- randomForest(x = trainx, y = trainy, importance = TRUE,
                              ntree = ntree, mtry = min(mtry, length(columns_left)))
    assign(paste0("model_Sub_", iter), model_Sub)
    
    # Rank features only while the full feature set is still in use; the
    # ranking is consumed later by feature_elimination().
    if (length(columns_left) == length(feature_names)) {
        ranked_features = get_variable_importance(model_Sub)
    }
    # Keep training-set (out-of-bag) error records
    obb_error = rbind(obb_error, data.table(iter, obb_error_func(model_Sub)), use.names = FALSE)
    
    # Test the model on the test set; columns are suffixed with the iteration
    # number so each iteration's predictions sit side by side.
    test_predictions_Sub = get_test_predictions(model_Sub, test_set, error_type)
    predictedLabels_Sub = test_predictions_Sub[[1]]
    setnames(predictedLabels_Sub, c("pred_output", error_type), c(paste0("pred_output_", iter), paste0(error_type, "_", iter)))
    predictedLabels_table = cbind(predictedLabels_table, predictedLabels_Sub[,.SD, .SDcols = c(paste0("pred_output_", iter), paste0(error_type, "_", iter))])
    
    # Keep test-set error records
    performance_table = rbind(performance_table, data.table(iter, test_predictions_Sub[[2]]), use.names = FALSE)
    
    # Log this iteration; the flags are switched on below if the actions run.
    iteration_history = rbind(iteration_history, data.table(iter, 0, 0), use.names = FALSE)
    
    if (iter != iteration_budget) { # below efforts are unnecessary when the budget is reached.
          if (iter %in% sample_selection_iteration_order) {
              ## sample selection: pick the most uncertain candidates from the unlabeled pool
              unlabeled_set <- copy(unlabeled_pool)
              train_candidates = sample_selection(selected_ins, unlabeled_set, model_Sub)
              
              # eliminate candidates from the unlabeled pool
              unlabeled_pool = unlabeled_pool[-train_candidates$idx]
              rm(unlabeled_set)
              
              # run ABM to label the selected candidates (the expensive step)
              print(paste0("ABM train_candidate run start time : ",Sys.time()))
              train_candidates = run_ABM(nofrep, selected_ins, train_candidates)
              print(paste0("ABM train_candidate run end time : ",Sys.time()))
              
              train_candidates_table = rbind(train_candidates_table, data.table(train_candidates, iter = iter))
              
              # add labeled candidates to the train data (drop pool index)
              training_set_Ad = rbind(training_set_Ad, train_candidates[, -c("idx")])
              
              # FIX: update by reference with := instead of the subset-replacement
              # form `iteration_history[iter]$IsDataSelected = 1`.
              iteration_history[iter, IsDataSelected := 1]
          }
          if (iter %in% feature_elimination_iteration_order) {
              ## feature elimination: drop the h least important remaining features
              feature_elimination_result = feature_elimination(h, total_numof_eliminated_vars, ranked_features)
              
              columns_left = feature_elimination_result[[1]]
              eliminated_columns = feature_elimination_result[[4]]  # not used below; kept for inspection
              total_numof_eliminated_vars = as.numeric(feature_elimination_result[2])
              numof_eliminated_vars = as.numeric(feature_elimination_result[3])  # not used below; kept for inspection
              
              # FIX: update by reference with := (see above)
              iteration_history[iter, IsFeatureEliminated := 1]
          }
    }
iter = iter + 1  
}
print(paste0("section end time : ",Sys.time()))
print(paste0("section end time : ",Sys.time()))
[1] "section start time : 2020-01-08 21:46:52"
[1] "ABM train_candidate run start time : 2020-01-08 21:46:52"
[1] "ABM train_candidate run end time : 2020-01-08 21:49:56"
[1] "ABM train_candidate run start time : 2020-01-08 21:49:56"
[1] "ABM train_candidate run end time : 2020-01-08 21:50:19"
[1] "ABM train_candidate run start time : 2020-01-08 21:50:19"
[1] "ABM train_candidate run end time : 2020-01-08 21:50:37"
[1] "ABM train_candidate run start time : 2020-01-08 21:50:38"
[1] "ABM train_candidate run end time : 2020-01-08 21:51:46"
[1] "ABM train_candidate run start time : 2020-01-08 21:51:46"
[1] "ABM train_candidate run end time : 2020-01-08 21:52:08"
[1] "ABM train_candidate run start time : 2020-01-08 21:52:08"
[1] "ABM train_candidate run end time : 2020-01-08 21:53:04"
[1] "ABM train_candidate run start time : 2020-01-08 21:53:04"
[1] "ABM train_candidate run end time : 2020-01-08 21:53:29"
[1] "ABM train_candidate run start time : 2020-01-08 21:53:29"
[1] "ABM train_candidate run end time : 2020-01-08 21:54:24"
[1] "ABM train_candidate run start time : 2020-01-08 21:54:25"
[1] "ABM train_candidate run end time : 2020-01-08 21:54:57"
[1] "ABM train_candidate run start time : 2020-01-08 21:54:58"
[1] "ABM train_candidate run end time : 2020-01-08 21:56:23"
[1] "ABM train_candidate run start time : 2020-01-08 21:56:23"
[1] "ABM train_candidate run end time : 2020-01-08 21:56:53"
[1] "ABM train_candidate run start time : 2020-01-08 21:56:53"
[1] "ABM train_candidate run end time : 2020-01-08 21:57:47"
[1] "ABM train_candidate run start time : 2020-01-08 21:57:47"
[1] "ABM train_candidate run end time : 2020-01-08 21:58:14"
[1] "ABM train_candidate run start time : 2020-01-08 21:58:14"
[1] "ABM train_candidate run end time : 2020-01-08 21:59:14"
[1] "ABM train_candidate run start time : 2020-01-08 21:59:14"
[1] "ABM train_candidate run end time : 2020-01-08 21:59:41"
[1] "ABM train_candidate run start time : 2020-01-08 21:59:41"
[1] "ABM train_candidate run end time : 2020-01-08 22:00:50"
[1] "ABM train_candidate run start time : 2020-01-08 22:00:50"
[1] "ABM train_candidate run end time : 2020-01-08 22:01:15"
[1] "ABM train_candidate run start time : 2020-01-08 22:01:15"
[1] "ABM train_candidate run end time : 2020-01-08 22:01:37"
[1] "ABM train_candidate run start time : 2020-01-08 22:01:37"
[1] "ABM train_candidate run end time : 2020-01-08 22:02:30"
[1] "section end time : 2020-01-08 22:02:30"

950 runs: "section start time : 2020-01-08 21:46:52" // "section end time : 2020-01-08 22:02:30"

In [68]:
# FIX: `performance_error_table` was only built in commented-out code, so the
# final-row lookup below errored unless the table survived from an earlier
# session. Build it here before use: keep iter plus the chosen error column,
# rename to a generic "error", and add the lagged error for delta inspection.
performance_error_table = performance_table[,.SD,.SDcols = c("iter",tolower(error_type))]
setnames(performance_error_table,c("iter","error"))
performance_error_table[, lag_error := shift(error,1,type = "lag")]
# Show the last iteration's error and its predecessor.
performance_error_table[.N]
In [74]:
columns_left
'%-similar-wanted'
In [54]:
# Final records
# Snapshot the adaptive-sampling-with-feature-elimination (AdFe) run under
# dedicated names so the shared working tables can be reused elsewhere.
FinalTrainData_AdFe = copy(training_set_Ad)
performance_table_AdFe = copy(performance_table)
train_candidates_table_AdFe  = copy(train_candidates_table)
predictedLabels_table_AdFe = copy(predictedLabels_table)
obb_error_AdFe = copy(obb_error)
In [55]:
#fwrite(FinalTrainData_AdFe,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/FinalTrainData_AdFe_BasicCode_",Sys.Date(),".csv") )
#fwrite(performance_table_AdFe,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/performance_table_AdFe_BasicCode_",Sys.Date(),".csv") )
#fwrite(train_candidates_table_AdFe,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/train_candidates_table_AdFe_BasicCode_",Sys.Date(),".csv") )
#fwrite(predictedLabels_table_AdFe,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/predictedLabels_table_AdFe_BasicCode_",Sys.Date(),".csv") )
#fwrite(obb_error_AdFe,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/obb_error_AdFe_BasicCode_",Sys.Date(),".csv") )
In [56]:
# Inspect the AdFe results: final training-set size, per-iteration test errors,
# selected candidates, per-instance predictions, and OOB error trajectory.
nrow(FinalTrainData_AdFe)
performance_table_AdFe
train_candidates_table_AdFe
head(predictedLabels_table_AdFe)
obb_error_AdFe
295
A data.table: 20 × 4
itermaermsemape
<dbl><dbl><dbl><dbl>
11.976288 4.5620802.890032
21.847960 4.3958942.633541
31.770368 4.2858612.544732
41.760720 4.2386482.533905
51.751490 4.2507672.516850
61.671947 4.1838032.425678
71.663493 4.1973962.421337
81.657437 4.2166532.414149
91.648544 4.1984162.404775
101.620069 4.1876992.356469
111.610693 4.1785892.341866
121.595499 4.2169972.320202
131.579449 4.1604912.301436
141.583705 4.1975752.308353
151.558521 4.1686302.268622
161.582896 4.1964282.309359
171.568282 4.1947502.286266
181.556581 4.1464072.253243
191.526085 4.1450892.202216
206.35927510.5260889.407042
A data.table: 95 × 5
density%-similar-wantedidxoutputiter
<dbl><dbl><int><dbl><dbl>
82.7606876.43015114 52.743001
49.0230977.00174154 65.975061
72.4103876.23947158 54.442361
66.0970977.64091423 56.791191
89.2501176.02308330 51.845751
28.2622589.73334477 60.919722
27.8200886.77187241 62.009852
34.4462274.35828239 99.825552
17.2823513.79890273 79.896922
20.2222313.09989383 75.815252
11.4662983.46279230100.000003
10.7511580.23808418100.000003
13.7996180.14878241100.000003
15.7565577.87148467100.000003
13.9742387.50010397100.000003
40.9477382.62391 88 55.477374
40.1951084.74781388 53.758304
37.9011183.14418195 56.243234
10.0136086.11170454100.000004
13.4204136.25440224 94.943344
11.8860434.09181 70 95.903455
33.9789436.86119121 88.587205
35.6625335.19057385 87.798745
44.7913713.29886197 58.664385
10.9336014.29982214 91.987725
80.5453471.36948236 99.477936
50.2000773.33066363 99.755286
32.4994719.08426118 66.661416
10.8419231.72772408 93.318436
10.3939317.20636457 89.599156
...............
45.2581622.80078270 65.8249714
56.0114126.68313 38 70.3290714
54.1915227.61548297 71.4047414
48.0163622.27623184 65.6326114
72.0387478.84159359 54.4666714
81.9387928.89621 70 72.7382815
47.9250124.76929101 65.8041115
52.1575618.57792 98 59.3456415
51.1435139.78230380 82.6910415
11.0998527.43652355 89.9891515
22.4899730.27819430 81.8151116
46.4174519.75762412 60.0032516
42.6889919.55907296 61.6441116
41.1140531.33199272 76.6489016
69.0707978.26982242 55.3490216
87.0358821.33651103 57.1088917
24.7532439.41073220 91.8375717
73.0407026.74639341 65.6574517
84.0774121.96245 87 58.1505317
42.5167934.39959389 85.4894817
20.6984071.44970280 99.9214618
88.5286823.05890 77 57.8704618
20.1300467.90056321100.0000018
74.9683522.05082106 61.0121018
12.4914129.81086102 91.1144118
19.8482529.66990108 83.7865119
12.6781777.47627231100.0000019
81.1026723.74303362 58.8125119
25.0885753.22702 17 98.6239819
62.3076933.14994179 73.1387019
A data.table: 6 × 43
density%-similar-wantedoutputpred_output_1RMSE_1pred_output_2RMSE_2pred_output_3RMSE_3pred_output_4...pred_output_16RMSE_16pred_output_17RMSE_17pred_output_18RMSE_18pred_output_19RMSE_19pred_output_20RMSE_20
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>...<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
89.3902663.8771198.2515798.064080.187490785497.901240.3503264697.558960.69261142397.71552...97.911130.34043500197.805970.44559574197.747240.504325715497.975890.2756737298.941480.68991317
39.2718236.4603986.9769384.776772.200152745884.673402.3035236784.805302.17162779884.59319...86.612330.36459486786.939590.03733269086.582220.394704805386.490560.4863697991.161484.18455238
30.1745557.2362998.3517998.462490.110700871598.448870.0970790798.451860.10006516798.48240...98.440320.08852689998.443990.09220043698.351370.000426662198.368740.0169429797.481380.87041492
28.1221068.0556499.8650499.864570.000475419199.875400.0103614899.871330.00628454699.86404...99.861700.00334002999.859420.00561810499.847240.017799382899.846860.0181874599.860230.00481316
80.0970142.1845582.7019185.227542.525628692585.297442.5955311185.343972.64205649585.39381...85.335522.63360701685.273302.57138948985.331452.629541008385.166152.4642361689.692426.99051355
67.6895515.8121254.6247357.359352.734617875257.054222.4294928256.767662.14292580456.94051...55.361790.73705857455.294470.66973457955.316370.691640185155.348890.7241628760.030255.40552035
A data.table: 20 × 2
iterobb_error
<dbl><dbl>
1 13.206488
2 10.927211
3 9.601889
4 8.278378
5 8.291517
6 7.811656
7 7.498688
8 8.064948
9 7.260196
10 6.638461
11 7.325007
12 6.559538
13 6.846907
14 6.828797
15 6.249656
16 6.488448
17 6.525023
18 6.234868
19 6.514921
20133.885355
In [72]:
iteration_history
A data.table: 20 × 3
iter_noIsFeatureEliminatedIsDataSelected
<dbl><dbl><dbl>
101
201
301
401
501
601
701
801
901
1001
1101
1201
1301
1401
1501
1601
1701
1801
1911
2000
In [73]:
# Melt AdFe performance to long format and plot error trajectories, marking
# feature-elimination iterations (black dashed vlines) and data-selection
# iterations (yellow dotdash vlines). The `+ 1` shifts each marker to the
# iteration where the action's effect first appears in the retrained model.
performance_molten_AdFe <- melt(data = performance_table_AdFe
                             , id.vars = 'iter')
setnames(performance_molten_AdFe, c("variable","value"),c("errortype","errorvalue"))
p_AdFe = ggplot(performance_molten_AdFe, aes(x = iter, y = errorvalue, group=errortype, col=errortype)) + 
            geom_line(lwd=1) +
            geom_vline(xintercept = iteration_history[IsFeatureEliminated==1]$iter_no + 1, linetype = "dashed") +
            geom_vline(xintercept = iteration_history[IsDataSelected==1]$iter_no + 1, linetype = "dotdash",color = "yellow")
p_AdFe

Adaptive Sampling with/without Feature Elimination

In [75]:
# Side-by-side: adaptive sampling without (left) vs with (right) feature elimination.
grid.arrange(p_Ad, p_AdFe, ncol=2)
In [95]:
# Compare adaptive sampling with vs without feature elimination, faceted by
# experiment type; the one-shot benchmark is overlaid as dashed horizontal
# lines and the elimination iteration as a dashed vertical line.
performance_Ad_vs_AdFe = rbind(performance_molten_Ad[,.(iter,errortype,errorvalue, type = "Ad")], performance_molten_AdFe[,.(iter,errortype,errorvalue, type = "AdFe")])
p_Ad_vs_AdFe = ggplot(performance_Ad_vs_AdFe, aes(x = iter, y = errorvalue, group=errortype, col=errortype)) + 
            geom_line(lwd=1) +
            geom_vline(xintercept = iteration_history[IsFeatureEliminated==1]$iter_no + 1, linetype = "dashed") +
            # FIX: dropped `stat = "hline"` — that stat was removed from ggplot2
            # (>= 2.0); geom_hline() uses the identity stat internally and the
            # argument is rejected as an unknown parameter.
            geom_hline(data = performance_molten_oneshot, aes(yintercept = errorvalue, group=errortype, col=errortype), linetype = "dashed") +
            facet_wrap(~type)
p_Ad_vs_AdFe
In [80]:
ggplotly(p_Ad_vs_AdFe)
In [ ]:
#varImpPlot(model_Ad)

Quit NL

In [ ]:
# Shut down the headless NetLogo instance started at the top of the notebook
# (NLStart with nl.obj = nl.model).
NLQuit(nl.obj = nl.model)
#NLQuit(all=FALSE)